
import string, re
import error
from token import *
from keywords import keywords

def group(*choices):
    """Join the given regex alternatives into one parenthesised group."""
    alternation = '|'.join(choices)
    return '(' + alternation + ')'
def any(*choices):
    """Match zero or more repetitions of the grouped alternatives."""
    return group(*choices) + '*'
def maybe(*choices):
    """Match an optional (zero-or-one) occurrence of the grouped alternatives."""
    return group(*choices) + '?'

# --- regular expression fragments used to build the tokenizer ---

Whitespace = r'[ \f\t]*'                 # horizontal whitespace, no newlines

# Identifiers: a letter or underscore, then word characters or '-'.
Name = r'[a-zA-Z_][\w-]*'

# NOTE(review): Decnumber starts at [1-9], so a lone literal "0" does not
# match and falls through to ERRORTOKEN -- confirm this is intended.
Decnumber = r'[1-9]\d*'
Pointfloat = group(r'\d+\.\d*', r'\.\d+')
Number = group(Pointfloat, Decnumber)

# An embedding: '<' up to a closing '>' or to the end of the line.
embed = r'<.*' + group('>', r'\n')

#regex explanation:
# [">] match with " or >
# [^\n"<>\\]*  match with 0 or more characters as long its not \n " < > \
#(?:\\.[^\n"<>\\]*)* Used to escape characters. \" 0 or more 
ContStr = r'[">][^\n"<>\\]*(?:\\.[^\n"<>\\]*)*' + group('<', r'\n', r'"')
# /* comment body, terminated by '*/' or continued past the newline.
# Fixed: '*/' terminator was the non-raw string "\*/" -- same pattern, but
# \* is an invalid string escape (SyntaxWarning in Python 3.12+).
ContComment = r'/\*[^\n\*\\]*(?:\\.[^\n\*\\]*)*' + group(r'\*/', r'\n')
# Symbols: a single quote followed by anything up to whitespace/punctuation.
Symbol = r"'[^\n \f\t;,)(><]*"

# Continuation pieces of a multi-line string: up to the start of an
# embedding ('<') or up to the closing quote ('"').
preString = r'[^"\\]*(?:\\.[^"\n\\]*)*<'
endString = r'[^"\\]*(?:\\.[^"\n\\]*)*"'

end_string = re.compile(endString)
pre_string = re.compile(preString)

# continued xml comment from keyword comment. (comment which ends up in the xml)
xmlComment = r'comment[ ]*"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\n')
end_xmlComment = re.compile(r'[^"\\]*(?:\\.[^"\n\\]*)*"')

# continued comment
endcomment = re.compile(r'[^*\\]*(?:\\.[^*\\]*)*\*/')
# single comment line
Comment = r'//[^\r\n]*'

Operator = r"[+%&|^`=?!]"

Bracket = r'[][(){}]'
Special = group(r'\r?\n', r"[:;.,$@#/'<]")
Funny = group(Bracket, Special, Operator)

# A "pseudo token" matches any single lexical element; group 1 of a match
# spans the token text that follows the leading whitespace.
PseudoExtras = group(r'\\\r?\n', Comment, xmlComment )
PseudoToken = Whitespace + group(PseudoExtras, Number, Name,
                ContStr, ContComment,Symbol,Funny) #ContEmb, Funny)

pseudoprog = re.compile( PseudoToken )


def printtoken(type, token, start, end, line): # for testing
    """Print one token in the "srow,scol-erow,ecol: TYPE repr" debug format.

    start and end are (row, col) tuples; line is accepted (and ignored) so
    the signature matches the 5-tuples produced by generate_tokens().
    """
    # PEP 3113: tuple parameters were removed in Python 3 -- unpack manually.
    (srow, scol), (erow, ecol) = start, end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokentostring(type, token, start, end, line): # for testing
    """Return one token formatted as "srow,scol-erow,ecol: TYPE repr".

    start and end are (row, col) tuples; line is accepted (and ignored) so
    the signature matches the 5-tuples produced by generate_tokens().
    """
    # PEP 3113: tuple parameters were removed in Python 3 -- unpack manually.
    (srow, scol), (erow, ecol) = start, end
    return "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))


def tokenize(readline, tokeneater=printtoken):
    """Drive the tokenizer over an input stream.

    readline   -- callable with the same interface as the readline() method
                  of built-in file objects; each call returns one line of
                  input as a string.
    tokeneater -- callable invoked once per token with the five values of
                  each tuple produced by generate_tokens(); defaults to the
                  debug printer printtoken.
    """
    for tok in generate_tokens(readline):
        tokeneater(*tok)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # Continuation buffers: one of these is non-empty while a multi-line
    # xml comment / string / comment is being accumulated across lines.
    contxmlc, contstr, contcomment =  '', '', ''
    contline = None
    # True after an embedding-end '>' until the string piece that follows
    # it is emitted; selects MIDSTRING/POSTSTRING over PRESTRING/STRING.
    postembed = False

    while 1:                                   # loop over lines in stream
        try :
            line = readline()
        except StopIteration:
            line = ''

        lnum = lnum + 1
        pos, max = 0, len(line)

        if contxmlc:                           # continued xml comment
            if not line:
                raise error.TokenError("EOF multi-line xml comment", xmlcmntstart)
            if not contxmlc[0] == '"': # case:  comment \n " (so comment string starts on next line)
                if not line.strip()[0] == '"': 
                    raise error.TokenError("EOF multi-line xml comment", xmlcmntstart)
                start = line.find('"')
                xmlcmntstart = (lnum, start)
                # keep only the opening quote; the rest of the line is
                # rescanned below for the closing quote
                contxmlc = line[start]
                contline = line
                line = line[start+1:]

            endmatch = end_xmlComment.match(line) 
            if endmatch:
                pos = end = endmatch.end(0)
                yield(STRING, contxmlc + line[:end],
                           xmlcmntstart, (lnum, end), contline + line)
                contxmlc = ''
                contline = None
            else:
                # closing quote not on this line; keep accumulating
                contxmlc = contxmlc + line
                contline = contline + line
                continue
        if contstr:                            # continued string
            if not line:
                raise error.TokenError("EOF in multi-line string", strstart)

            endmatch = pre_string.match(line) # pre string of an embedding
            if not endmatch:
                endmatch = end_string.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                if line[end-1] == '<':         # embedding
                    pos = end = end-1
                    # PRESTRING opens an embedding; MIDSTRING sits between
                    # two embeddings of the same string
                    stringType = PRESTRING if not postembed else MIDSTRING
                    yield (stringType, contstr + line[:end],
                            strstart, (lnum,end), contline + line)
                elif postembed:
                    yield (POSTSTRING, contstr + line[:end-1],
                            strstart, (lnum,end), contline + line)
                else:
                    yield (STRING, contstr + line[:end-1],
                           strstart, (lnum, end), contline + line)
                postembed = False
                contstr = ''
                contline = None
            else :
                contstr = contstr + line
                contline = contline + line
                continue
        elif contcomment:                       # continued comment
            if not line:
                raise error.TokenError("EOF in multi-line comment", commentstart)
            endmatch = endcomment.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield(COMMENT, contcomment + line[:end],
                           commentstart, (lnum, end), contline + line)
                contcomment = ''
                contline = None
            else:
                contcomment = contcomment + line
                contline = contline + line
                continue
        elif parenlev == 0 :  # new statement
            if not line: break
            column = 0

            if line[pos] in '\r\n':           # skip blank lines
                #yield (NL, line[pos:], (lnum, pos), (lnum, len(line)), line)
                continue
        else :                                # continued statement
            if not line:
                raise error.TokenError("EOF in multi-line statement", (0, 0))

        while pos < max:                       # scan tokens in this line
            pseudomatch = pseudoprog.match(line, pos)
            if not pseudomatch:                            # scan for tokens
                # no rule matched: emit the offending character and move on
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1
                continue

            # group 1 of the pseudo-token regex spans the token text after
            # any leading whitespace
            start, end = pseudomatch.span(1)
            spos, epos, pos = (lnum, start), (lnum, end), end
            token, initial = line[start:end], line[start]

            if initial in numchars or \
               (initial == '.' and token != '.'):      # ordinary number
                yield (NUMBER, token, spos, epos, line)
            elif initial in '\r\n':
                break
            elif initial == '/' and token[:2] == '//': # comment
                yield (COMMENT, token, spos, epos, line)
                continue
            elif initial == '/' and token[:2] == '/*':
                if token[-1] == '\n':                  # multiline comment
                    contline = line
                    commentstart = (lnum, start)
                    contcomment = line[start:]
                    break
                else :                                 # ordinary comment
                    yield(COMMENT, token, spos, epos, line)
                    continue
            elif initial in "'":
                    # symbol: yielded without its leading quote
                    yield (STRING, token[1:], spos, epos, line)
                    #pos = pos - 1 # Enable recognizeing endpoint stmnt ;
                    continue
            elif initial in '"' or initial in ">" :
                start = start + 1
                if initial in ">":
                    # '>' closes an embedding; what follows belongs to the
                    # surrounding string
                    postembed = True
                    yield (EMBEND, token[0], spos, epos, line)
                if token[-1] == '\n':                  # continued string
                    strstart = (lnum, start)
                    contstr = line[start:]
                    contline = line
                    break
                elif token[-1] == '<':                 # embedding
                    stringType = PRESTRING if not postembed else MIDSTRING
                    yield(stringType, token[1:-1], spos, (lnum, pos), line)
                    pos = pos - 1 # Enable recognizeing start point embed.
                elif initial in ">":
                    postembed = False
                    yield (POSTSTRING, token[1:-1], spos, (lnum, pos), line)
                else:                                  # ordinary string
                    yield (STRING, token[1:-1], spos, epos, line)
            elif initial in "<":
                    yield (EMBSTRT, token, spos, epos, line)
            elif initial in namechars:                 # ordinary name
                # could be special xml comment case!
                if token.startswith('comment') and (
                        token.endswith('"') or token.endswith('\n')):
                    yield (KEYWORD, token[:7], spos, (lnum, start+7), line)
                    start = token.find('"')
                    # NOTE(review): 'not start' is only true for start == 0,
                    # which cannot follow the 'comment' prefix; a missing
                    # quote gives -1 here -- confirm intended behaviour.
                    if not start: start = 7
                    if token[-1] == '"':
                        yield (STRING, token[start:], (lnum,start), epos, line)
                    else:
                        xmlcmntstart = (lnum, start)
                        contxmlc = token[start:]
                        contline = line
                        break
                #keyword check
                elif keywords.has_key(token.upper()):
                    yield (KEYWORD, token, spos, epos, line)
                else:
                    yield (NAME, token, spos, epos, line)
            else:
                # operators, brackets and other single-character specials;
                # bracket nesting controls the "continued statement" state
                if initial in '([{': parenlev = parenlev + 1
                elif initial in ')]}': parenlev = parenlev - 1
                yield (OP, token, spos, epos, line)
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Use a with-statement so the input file is closed deterministically
        # instead of leaking the handle.
        with open(sys.argv[1]) as src:
            tokenize(src.readline)
    else:
        tokenize(sys.stdin.readline)
